# Default chunk option: echo the source code of every chunk in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
# reticulate supplies use_python(), used below to pick the Python interpreter.
library(reticulate)
# Interpreter paths are machine-specific; adjust when knitting on another computer.
use_python("/Users/oldemarrodriguez/anaconda3/bin/python3.7") # PROMIDAT
# use_python("/anaconda3/bin/python3.6") ## Laptop
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import numpy as np
from math import pi
Para mostrar un gráfico en un archivo Rmd, use plt.show() seguido de plt.close()
def open_close_plot():
    """Display the current matplotlib figure and then close it.

    Takes no arguments and returns nothing; it only calls ``plt.show()``
    followed by ``plt.close()`` so each chunk can end its figure with a
    single call.
    """
    # Render first, then discard the figure.
    plt.show()
    plt.close()
def bar_plot(centros, labels, cluster = None, var = None):
    """Draw one horizontal bar chart per cluster centre, side by side.

    Parameters
    ----------
    centros : array-like
        Matrix of cluster centres, indexed as ``centros[cluster][variable]``.
    labels : sequence of str
        Variable names, one per column of ``centros``. A pandas Index, a
        NumPy array or a plain list are all accepted.
    cluster : iterable of int, optional
        Row indices of the centres to draw. ``None`` draws every row.
    var : collection of str, optional
        Names of the variables to keep. ``None`` keeps every variable.

    The charts are drawn with ``plt.subplot`` on the current figure; creating
    and showing that figure is left to the caller. Nothing is returned.
    """
    from math import ceil, floor
    from seaborn import color_palette

    centros = np.asarray(centros)
    # A plain list cannot be indexed with a boolean mask, so normalise first.
    labels = np.asarray(labels)
    colores = color_palette()

    # Lower x-limit: 0 unless some centre is negative. It is taken from the
    # full matrix, before any variable filtering.
    minimo = min(floor(centros.min()), 0)

    if var is not None:
        mascara = np.array([nombre in var for nombre in labels], dtype = bool)
        # Each kept variable retains the colour of its original position.
        colores = [colores[k % len(colores)] for k in np.flatnonzero(mascara)]
        centros = centros[:, mascara]
        labels = labels[mascara]

    # Upper x-limit, shared by all charts so that they are comparable.
    maximo = ceil(centros.max())

    def inside_plot(valores, titulo):
        # One chart: a bar per variable, common x-range, cluster title.
        plt.barh(range(len(valores)), valores, 1/1.5, color = colores)
        plt.xlim(minimo, maximo)
        plt.title(titulo)

    seleccion = range(centros.shape[0]) if cluster is None else list(cluster)
    for pos, i in enumerate(seleccion, start = 1):
        plt.subplot(1, len(seleccion), pos)
        inside_plot(centros[i].tolist(), 'Cluster ' + str(i))
        # Only the leftmost chart carries the variable names.
        if pos == 1:
            plt.yticks(range(len(labels)), labels)
        else:
            plt.yticks([])
def radar_plot(centros, labels):
    """Draw every cluster centre as a polygon on one shared radar chart.

    Each variable (column of ``centros``) is rescaled on its own to the
    0-100 range across the clusters, so the chart compares clusters
    variable by variable rather than showing raw values. A variable with
    the same value in every cluster is drawn at 50.

    Parameters
    ----------
    centros : array-like
        Matrix of cluster centres, indexed as ``centros[cluster][variable]``.
    labels : sequence of str
        Variable names, one per column of ``centros``; used as axis labels.

    The chart is drawn on the current figure via ``plt.subplot``; nothing is
    returned.
    """
    from math import pi

    def _escalar(columna):
        # Min-max rescale one variable to 0-100 across the clusters.
        rango = columna.max() - columna.min()
        if rango == 0:
            # Constant variable: use mid-scale. Dividing the column by itself
            # (as was done before) gives NaN when the constant is 0.
            return np.full(columna.shape, 50.0)
        return (columna - columna.min()) / rango * 100

    # After this step rows are variables and columns are clusters.
    centros = np.array([_escalar(n) for n in np.asarray(centros, dtype = float).T])

    # One evenly spaced angle per variable, then repeat the first angle so
    # each polygon closes on itself.
    total = float(len(labels))
    angulos = [k / total * 2 * pi for k in range(len(labels))]
    angulos += angulos[:1]

    ax = plt.subplot(111, polar = True)
    # First axis at the top, remaining axes placed clockwise.
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)
    plt.xticks(angulos[:-1], labels)

    # Radial grid every 10 %, labelled along the first axis.
    ax.set_rlabel_position(0)
    marcas = list(range(10, 101, 10))
    plt.yticks(marcas, [str(m) + "%" for m in marcas], color = "grey", size = 8)
    plt.ylim(-10, 100)

    for i in range(centros.shape[1]):
        valores = centros[:, i].tolist()
        valores += valores[:1]
        ax.plot(angulos, valores, linewidth = 1, linestyle = 'solid',
                label = 'Cluster ' + str(i))
        ax.fill(angulos, valores, alpha = 0.3)
    plt.legend(loc='upper right', bbox_to_anchor = (0.1, 0.1))
# Work from the course data directory so the CSV files can be opened by bare name.
# The absolute path is specific to the author's machine.
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
print(os.getcwd())
/Users/oldemarrodriguez/Google Drive/MDCurso/Datos
# Load the students example: ';'-separated fields, ',' as decimal mark,
# first column used as the row index.
datos = pd.read_csv('EjemploEstudiantes.csv',delimiter=';',decimal=",",index_col=0)
print(datos)
Matematicas Ciencias Espanol Historia EdFisica
Lucia 7.0 6.5 9.2 8.6 8.0
Pedro 7.5 9.4 7.3 7.0 7.0
Ines 7.6 9.2 8.0 8.0 7.5
Luis 5.0 6.5 6.5 7.0 9.0
Andres 6.0 6.0 7.8 8.9 7.3
Ana 7.8 9.6 7.7 8.0 6.5
Carlos 6.3 6.4 8.2 9.0 7.2
Jose 7.9 9.7 7.5 8.0 6.0
Sonia 6.0 6.0 6.5 5.5 8.7
Maria 6.8 7.2 8.7 9.0 7.0
# Show the table's dimensions.
print(datos.shape)
(10, 5)
Ejecuta k-medias con 3 clusters
# Fit k-means with 3 clusters on the students table and print the cluster
# predicted for each row.
kmedias = KMeans(n_clusters=3)
kmedias.fit(datos)
print(kmedias.predict(datos))
[1 0 0 2 1 0 1 0 2 1]
# Keep the fitted cluster centres as a NumPy array and display them.
centros = np.array(kmedias.cluster_centers_)
print(centros)
[[7.7 9.475 7.625 7.75 6.75 ]
[6.525 6.525 8.475 8.875 7.375]
[5.5 6.25 6.5 6.25 8.85 ]]
# Bar charts of the cluster centres, one chart per cluster.
plt.figure(1, figsize = (12, 8))
bar_plot(centros, datos.columns)
open_close_plot()
# Radar chart of the same centres.
plt.figure(1, figsize = (10, 10))
radar_plot(centros, datos.columns)
open_close_plot()
# Load the iris data (';'-separated, '.' as decimal mark) from the data
# directory and preview the first rows.
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
iris = pd.read_csv('iris.csv',delimiter=';',decimal=".")
print(iris.head())
s.largo s.ancho p.largo p.ancho tipo
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
# Show the table's dimensions.
print(iris.shape)
(150, 5)
Ejecuta k-medias con 3 clusters
# New 3-cluster model; it is fitted further below.
kmedias = KMeans(n_clusters=3)
# Keep only the first four columns; the preview printed above shows the
# fifth column is 'tipo', which is therefore left out.
iris_tempo=iris.iloc[:,:4]
print(iris_tempo.head())
s.largo s.ancho p.largo p.ancho
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
# Fit the model on the four kept columns and print the cluster predicted
# for each row.
kmedias.fit(iris_tempo)
print(kmedias.predict(iris_tempo))
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 0 2 0 0 0 0
0 0 2 2 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0 0 0 2 0
0 2]
Los dibuja sobre el plano principal
# Project the four kept columns onto two principal components and print
# the first six projected rows.
pca = PCA(n_components=2)
componentes = pca.fit_transform(iris_tempo)
print(componentes[:6,:])
[[-2.68420713 0.32660731]
[-2.71539062 -0.16955685]
[-2.88981954 -0.13734561]
[-2.7464372 -0.31112432]
[-2.72859298 0.33392456]
[-2.27989736 0.74778271]]
# Show the dimensions of the projected data.
print(componentes.shape)
(150, 2)
# Scatter plot of the two components, each point coloured by its predicted cluster.
plt.scatter(componentes[:, 0], componentes[:, 1],c=kmedias.predict(iris_tempo))
plt.xlabel('componente 1')
plt.ylabel('componente 2')
plt.title('3 Cluster K-Medias')
open_close_plot()
# Take the centres from the model just fitted on the iris columns. Without
# this, `centros` and `datos.columns` still refer to the students example
# and the charts below would repeat that data set instead of iris.
centros = np.array(kmedias.cluster_centers_)
# Bar charts of the iris cluster centres, one chart per cluster.
plt.figure(1, figsize = (12, 8))
bar_plot(centros, iris_tempo.columns)
open_close_plot()
# Radar chart of the same centres.
plt.figure(1, figsize = (10, 10))
radar_plot(centros, iris_tempo.columns)
open_close_plot()
# Load the customers example (';'-separated, ',' as decimal mark, first
# column as row index) from the data directory and preview the first rows.
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
datos = pd.read_csv('EjemploClientesCorregidaEdad.csv',delimiter=';',decimal=",",index_col=0)
print(datos.head())
Edad/10 ... Calidad Servicio
Nombre Cliente ...
Ariana 2.5 ... 6.6
Guiselle 2.4 ... 5.4
Francisco 2.8 ... 8.5
Griselda 2.3 ... 5.4
Damaris 4.9 ... 3.3
[5 rows x 12 columns]
# Show the table's dimensions.
print(datos.shape)
(37, 12)
# Summary statistics over the rows without missing values. The result is not
# assigned or printed explicitly, and `datos` itself is left unchanged.
datos.dropna().describe()
Ejecuta k-medias con 2 clusters
# Fit k-means with 2 clusters on the customers table and print the cluster
# predicted for each row.
kmedias = KMeans(n_clusters=2)
kmedias.fit(datos)
print(kmedias.predict(datos))
[0 0 0 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 0 1 0 1 0 1 1 1 1 0 0 1 0 0 0 1 1 0 0]
# Keep the fitted cluster centres as a NumPy array and display them.
centros = np.array(kmedias.cluster_centers_)
print(centros)
[[2.60555556 2.22222222 6.61111111 7.98888889 7.92222222 9.63333333
3.92777778 9.14444444 7.54444444 7.66666667 8.56666667 5.45 ]
[3.14210526 4.26315789 5.65263158 7.81052632 5.22105263 9.65263158
2.86315789 8.86315789 5.87368421 6.07368421 5.66315789 4.78421053]]
# Bar charts of the two cluster centres, one chart per cluster.
plt.figure(1, figsize = (12, 8))
bar_plot(centros, datos.columns)
open_close_plot()
# Radar chart of the same centres.
plt.figure(1, figsize = (10, 10))
radar_plot(centros, datos.columns)
open_close_plot()
Inercia Interclases
# Elbow plot of the inter-class inertia for k = 1..19 clusters.
Nc = range(1, 20)
kmediasList = [KMeans(n_clusters=i) for i in Nc]
# Total inertia: sum of squared deviations of every value from its column mean.
inercia_total = ((datos - datos.mean()) ** 2).values.sum()
# Inter-class inertia = total inertia - intra-class inertia. The previous
# version plotted score(), which is the negative intra-class inertia, so the
# curve had the right shape but negative values that did not match the label.
varianza = [inercia_total - modelo.fit(datos).inertia_ for modelo in kmediasList]
plt.plot(Nc,varianza,'o-')
plt.xlabel('Número de clústeres')
plt.ylabel('Varianza explicada por cada cluster (Inercia Interclases)')
plt.title('Codo de Jambu')
open_close_plot()
Inercia Intraclases
# Elbow plot of the intra-class inertia for k = 1..19 clusters.
Nc = range(1, 20)
kmediasList = [KMeans(n_clusters=i) for i in Nc]
# inertia_ is the within-cluster sum of squared distances of the fitted model.
varianza = [modelo.fit(datos).inertia_ for modelo in kmediasList]
plt.plot(Nc,varianza,'o-')
plt.xlabel('Número de clústeres')
plt.ylabel('Varianza explicada por cada cluster (Inercia Intraclases)')
plt.title('Codo de Jambu')
# Show and close the figure like every other chunk. A bare plt.show() leaves
# figure 1 open, so the next plt.figure(1, ...) call would reuse it.
open_close_plot()
En este caso el codo no se aprecia con claridad; K=3 o K=7 son buenas opciones
Ejecuta k-medias con 7 clusters
# Fit k-means with 7 clusters on the customers table and print the cluster
# predicted for each row.
kmedias = KMeans(n_clusters=7)
kmedias.fit(datos)
print(kmedias.predict(datos))
[0 0 5 6 1 4 0 4 4 0 3 4 0 3 1 4 4 5 0 4 5 1 2 2 4 3 2 0 0 3 5 5 5 2 4 0 0]
# Keep the fitted cluster centres as a NumPy array and display them.
centros = np.array(kmedias.cluster_centers_)
print(centros)
[[ 2.36 0.6 6.22 8.34 8.12 9.7
4.7 9.16 7.36 7.44 8.52 5.07 ]
[ 3.86666667 5.33333333 6.06666667 6.26666667 3.2 9.9
4.1 9.06666667 3.26666667 6.13333333 5. 2.93333333]
[ 2.75 0.5 4.9 7.8 4.05 9.425
0.775 9.65 7.35 6.5 6.75 3.925 ]
[ 3.25 8. 6.1 8.2 7.4 9.825
4.15 9.1 7.2 6.85 7.25 3.175 ]
[ 3.14444444 4.11111111 6.15555556 7.88888889 5.08888889 9.63333333
3.04444444 8.42222222 6.13333333 5.97777778 5.8 6.14444444]
[ 2.8 4.66666667 7.2 7.86666667 8.4 9.4
2.95 8.9 7.7 8.23333333 8.43333333 6.73333333]
[ 2.3 0. 3.4 7.8 9. 10.
1. 10. 4.4 4. 2.8 5.4 ]]
# Bar charts for all seven cluster centres and every variable.
plt.figure(1, figsize = (12, 8))
bar_plot(centros, datos.columns)
open_close_plot()
# Only the centre of cluster 3.
plt.figure(1, figsize = (12, 8))
bar_plot(centros, datos.columns, cluster=[3])
open_close_plot()
# All clusters, restricted to three named variables.
plt.figure(1, figsize = (12, 8))
bar_plot(centros, datos.columns, var=['Espacios Parqueo','Distribucion Productos','Calidad Servicio'])
open_close_plot()
# Clusters 1 and 3 only, restricted to the same three variables.
plt.figure(1, figsize = (12, 8))
bar_plot(centros, datos.columns, cluster=[1,3], var=['Espacios Parqueo','Distribucion Productos','Calidad Servicio'])
open_close_plot()
# Radar chart of all seven centres.
plt.figure(1, figsize = (10, 10))
radar_plot(centros, datos.columns)
open_close_plot()